Impact of Anchoring

Author

K. Grinde

Published

August 1, 2024

Setup

Packages

Show the code
library(GENESIS) # for example data, mixed models, etc.
library(SNPRelate) # for PCA, LD pruning, etc.
library(SeqArray) # working with gds files
library(SeqVarTools) # for PC-Relate setup
library(Biobase) # working with sample annotation files
library(dplyr) # tidyverse data tools
library(tidyr) # tidyverse data tools
library(ggplot2) # plotting
library(GGally) # plotting
library(RColorBrewer) # plotting (color schemes)
library(gridExtra) # plotting (multiple panels)
#devtools::install_github('UW-GAC/analysis_pipeline/TopmedPipeline')
library(TopmedPipeline)

Data

Download a subset of 1000 Genomes data here

Show the code
# genotypes
gdsfmt::showfile.gds(closeall=TRUE) # make sure files are not already open
gdsfile <- '../../data/1KG_phase3_subset.gds'
gds <- seqOpen(gdsfile)

# sample annotation
annotfile <- '../../data/1KG_phase3_subset_annot.RData'
annot <- getobj(annotfile)
#head(pData(annot))

Identify Admixed Individuals

We’ll focus on the “ASW” population (African Ancestry in Southwest US). The anchored analysis will include CEU (Northern European from Utah) and YRI (Yoruba) as the reference individuals.

Show the code
get_ids <- function(annot, popid){
  pData(annot) %>%
    filter(Population == popid) %>%
    pull(sample.id)
}
asw.ids <- get_ids(annot, 'ASW') # African Ancestry in Southwest US
ceu.ids <- get_ids(annot, 'CEU')
yri.ids <- get_ids(annot, 'YRI')
anchor.ids <- c(asw.ids, ceu.ids, yri.ids) # for anchored analysis

Admixed Only Analysis

Show the code
#### find unrelated samples ####
## step 1: LD pruning ####
set.seed(100)
snpset <- snpgdsLDpruning(gds, sample.id = asw.ids, 
                          method = 'corr', slide.max.bp=10e6, ld.threshold = sqrt(0.1))
SNV pruning based on LD:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 9,919
Excluding 14,721 SNVs (monomorphic: TRUE, MAF: 0.005, missing rate: 0.05)
    # of samples: 61
    # of SNVs: 9,919
    using 1 thread
    sliding window: 10,000,000 basepairs, Inf SNPs
    |LD| threshold: 0.316228
    method: correlation
Chrom 1: |====================|====================|
    27.05%, 303 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 2: |====================|====================|
    27.59%, 309 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 3: |====================|====================|
    25.36%, 284 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 4: |====================|====================|
    26.16%, 293 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 5: |====================|====================|
    25.18%, 282 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 6: |====================|====================|
    25.36%, 284 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 7: |====================|====================|
    23.30%, 261 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 8: |====================|====================|
    23.21%, 260 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 9: |====================|====================|
    22.50%, 252 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 10: |====================|====================|
    24.46%, 274 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 11: |====================|====================|
    22.50%, 252 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 12: |====================|====================|
    23.48%, 263 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 13: |====================|====================|
    21.34%, 239 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 14: |====================|====================|
    20.18%, 226 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 15: |====================|====================|
    17.95%, 201 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 16: |====================|====================|
    18.75%, 210 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 17: |====================|====================|
    18.57%, 208 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 18: |====================|====================|
    20.09%, 225 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 19: |====================|====================|
    17.23%, 193 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 20: |====================|====================|
    17.86%, 200 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 21: |====================|====================|
    13.57%, 152 / 1,120 (Tue Aug  6 13:47:26 2024)
Chrom 22: |====================|====================|
    12.86%, 144 / 1,120 (Tue Aug  6 13:47:26 2024)
5,315 markers are selected in total.
Show the code
#sapply(snpset, length)
pruned <- unlist(snpset, use.names = FALSE)

## step 2: KING ####
king <- snpgdsIBDKING(gds, sample.id = asw.ids, snp.id = pruned)
IBD analysis (KING method of moment) on genotypes:
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 1s
# of selected variants: 5,315
    # of samples: 61
    # of SNVs: 5,315
    using 1 thread
No family is specified, and all individuals are treated as singletons.
Relationship inference in the presence of population stratification.
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:27 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:27 2024    Done.
Show the code
kingMat <- king$kinship
colnames(kingMat) <- rownames(kingMat) <- king$sample.id

# extract pairwise kinship estimates and IBS0 to plot
kinship <- snpgdsIBDSelection(king)

# plot
ggplot(kinship, aes(IBS0, kinship)) +
  geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
  geom_point(alpha=0.5) +
  ylab("kinship estimate") +
  theme_bw()

Show the code
## step 3: PC-AiR ####
# partition samples into related and unrelated
sampset <- pcairPartition(kinobj=kingMat, kin.thresh=2^(-9/2),
                          divobj=kingMat, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 61 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
sapply(sampset, length)
  rels unrels 
     9     52 
Show the code
# run PCA on unrelated
pca.unrel <- snpgdsPCA(gds, sample.id=sampset$unrels, snp.id=pruned)
Principal Component Analysis (PCA) on genotypes:
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 5,154
Excluding 161 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
    # of samples: 52
    # of SNVs: 5,154
    using 1 thread
    # of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:27 2024    (internal increment: 18784)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:27 2024    Begin (eigenvalues and eigenvectors)
Tue Aug  6 13:47:27 2024    Done.
Show the code
# project values for relatives
snp.load <- snpgdsPCASNPLoading(pca.unrel, gdsobj=gds)
SNP Loading:
    # of samples: 52
    # of SNPs: 5,154
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:27 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:27 2024    Done.
Show the code
samp.load <- snpgdsPCASampLoading(snp.load, gdsobj=gds, sample.id=sampset$rels)
Sample Loading:
    # of samples: 9
    # of SNPs: 5,154
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:27 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:27 2024    Done.
Show the code
# combine unrelated and related PCs and order as in ASW ID list
pcs <- rbind(pca.unrel$eigenvect, samp.load$eigenvect)
rownames(pcs) <- c(pca.unrel$sample.id, samp.load$sample.id)
samp.ord <- match(asw.ids, rownames(pcs))
pcs <- pcs[samp.ord,]

## step 3b: determine which PCs are ancestry-informative ####
## (make a parallel coordinates plot)

# merge data with sample annotation info
pc.df <- as.data.frame(pcs)
names(pc.df) <- 1:ncol(pcs)
pc.df$sample.id <- row.names(pcs)
pc.df <- left_join(pc.df, pData(annot), by='sample.id')

# plot
pop.cols <- setNames(brewer.pal(12, "Paired"),
                     c("ACB", "ASW", "CEU", "GBR", "CHB", "JPT", "CLM", "MXL", "LWK", "YRI", "GIH", "PUR"))
ggparcoord(pc.df, columns=1:12, groupColumn="Population", scale="uniminmax") +
  scale_color_manual(values=pop.cols) +
  xlab("PC") + ylab("")

Show the code
p1 <- ggplot(pc.df, aes(`1`, `2`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC1', y = 'PC2')
p2 <- ggplot(pc.df, aes(`3`, `4`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC3', y = 'PC4')
p3 <- ggplot(pc.df, aes(`5`, `6`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC5', y = 'PC6')
grid.arrange(p1,p2,p3)

Show the code
## step 4: PC-Relate ####
# SeqVarIterator setup
seqResetFilter(gds)
# of selected samples: 1,126
# of selected variants: 25,760
Show the code
seqData <- SeqVarData(gds)
seqSetFilter(seqData, variant.id = pruned, sample.id = asw.ids)
# of selected samples: 61
# of selected variants: 5,315
Show the code
iterator <- SeqVarBlockIterator(seqData, verbose = F)

# run pcrelate
pcrel <- pcrelate(iterator, pcs = pcs[,1:2], training.set = sampset$unrels,
                  sample.include = asw.ids)
Using 6 CPU cores
61 samples to be included in the analysis...
Betas for 2 PC(s) will be calculated using 52 samples in training.set...
Running PC-Relate analysis for 61 samples using 5315 SNPs in 1 blocks...
Performing Small Sample Correction...
Show the code
#names(pcrel)

# make kinship matrix from PC-Relate results
pcrelMat <- pcrelateToMatrix(pcrel, scaleKin=1, verbose=FALSE)

# run PC-AiR again
seqResetFilter(seqData, verbose=FALSE)
pca <- pcair(seqData,
             kinobj=pcrelMat, kin.thresh=2^(-9/2),
             divobj=kingMat, div.thresh=-2^(-9/2),
             sample.include=asw.ids, snp.include=pruned,
             verbose=FALSE)
#names(pca)

# plot
pcs <- pca$vectors
pc.df <- as.data.frame(pcs)
names(pc.df) <- paste0("PC", 1:ncol(pcs))
pc.df$sample.id <- row.names(pcs)
pc.df <- left_join(pc.df, pData(annot), by="sample.id")

ggplot(pc.df, aes(PC1, PC2, color=Population)) + geom_point() +
  scale_color_manual(values=pop.cols)

Show the code
# use revised PCs to get new kinship estimates
seqSetFilter(seqData, variant.id=pruned)
# of selected variants: 5,315
Show the code
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)
pcrel <- pcrelate(iterator, pcs=pcs[,1:2], training.set=pca$unrels, 
                  sample.include=asw.ids)
Using 6 CPU cores
61 samples to be included in the analysis...
Betas for 2 PC(s) will be calculated using 53 samples in training.set...
Running PC-Relate analysis for 61 samples using 5315 SNPs in 1 blocks...
Performing Small Sample Correction...
Show the code
# plot new kinship estimates
kinship <- pcrel$kinBtwn

ggplot(kinship, aes(k0, kin)) +
  geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
  geom_point(alpha=0.5) +
  ylab("kinship estimate") +
  theme_bw()

Show the code
# get new partition
pcrelMat <- pcrelateToMatrix(pcrel, scaleKin=1, verbose=FALSE)
sampset2 <- pcairPartition(kinobj=pcrelMat, kin.thresh=2^(-9/2),
                           divobj=kingMat, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 61 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
asw.unrel.ids <- sampset2$unrels


#### run "naive" PCA ####
pca.naive <- snpgdsPCA(gds, sample.id = asw.unrel.ids)
Principal Component Analysis (PCA) on genotypes:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 9,665
Excluding 14,975 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
    # of samples: 53
    # of SNVs: 9,665
    using 1 thread
    # of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:32 2024    (internal increment: 18432)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:32 2024    Begin (eigenvalues and eigenvectors)
Tue Aug  6 13:47:32 2024    Done.
Show the code
# calculate SNP loadings
snp.load <- snpgdsPCASNPLoading(pca.naive, gdsobj=gds)
SNP Loading:
    # of samples: 53
    # of SNPs: 9,665
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:32 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:32 2024    Done.
Show the code
# plot set-up
snp.load.df <- as.data.frame(cbind(snp.load$snp.id,
                                   t(snp.load$snploading)))
names(snp.load.df) <- c('snp.id', paste('PC',1:32,sep=''))
missing <- !(readex.gdsn(index.gdsn(gds, 'variant.id')) %in% snp.load$snp.id)
snp.annot <- data.frame(snp.id = readex.gdsn(index.gdsn(gds, 'variant.id')),
                        chr = readex.gdsn(index.gdsn(gds, 'chromosome')),
                        pos = readex.gdsn(index.gdsn(gds, 'position')),
                        stringsAsFactors = F)
snp.load.df <- left_join(snp.load.df, snp.annot, by = 'snp.id')
snp.load.df.long <- snp.load.df %>%
  pivot_longer(cols=PC1:PC32, names_to = 'PC') %>%
  mutate(PC = factor(PC, levels = paste0("PC", 1:32))) %>%
  mutate(chr = factor(chr, levels = c(1:22, "X")))

# set up color scheme
chr <- levels(snp.load.df.long$chr)
cmap <- setNames(rep_len(brewer.pal(8, "Dark2"), length(chr)), chr)

# plot first 4 PCs page
n_pcs <- length(unique(snp.load.df.long$PC))
n_plots <- ceiling(n_pcs/as.integer(4)) # change 4 depending on how many plots per page you want
bins <- as.integer(cut(1:n_pcs, n_plots))
i <- 1
bin <- paste0("PC", which(bins == i))
dat <- filter(snp.load.df.long, PC %in% bin)
ggplot(dat, aes(chr, value, group=interaction(chr, pos), color=chr)) +
  geom_point(position=position_dodge(0.8)) +
  facet_wrap(~PC, scales="free", ncol=1) +
  scale_color_manual(values=cmap, breaks=names(cmap)) +
  #ylim(0,1) +
  ylim(-0.05,0.05)+
  theme_bw() +
  theme(legend.position="none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("chromosome") + ylab("loading")
Warning: Removed 74 rows containing missing values or values outside the scale range
(`geom_point()`).

Anchored Analysis

Show the code
## step 1: LD pruning ####
set.seed(100)
snpset.anchored <- snpgdsLDpruning(gds, sample.id = c(asw.ids, ceu.ids, yri.ids), 
                          method = 'corr', slide.max.bp=10e6, ld.threshold = sqrt(0.1))
SNV pruning based on LD:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 9,179
Excluding 15,461 SNVs (monomorphic: TRUE, MAF: 0.005, missing rate: 0.05)
    # of samples: 268
    # of SNVs: 9,179
    using 1 thread
    sliding window: 10,000,000 basepairs, Inf SNPs
    |LD| threshold: 0.316228
    method: correlation
Chrom 1: |====================|====================|
    35.89%, 402 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 2: |====================|====================|
    34.29%, 384 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 3: |====================|====================|
    33.21%, 372 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 4: |====================|====================|
    34.55%, 387 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 5: |====================|====================|
    33.04%, 370 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 6: |====================|====================|
    34.20%, 383 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 7: |====================|====================|
    32.41%, 363 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 8: |====================|====================|
    32.59%, 365 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 9: |====================|====================|
    35.45%, 397 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 10: |====================|====================|
    33.57%, 376 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 11: |====================|====================|
    33.93%, 380 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 12: |====================|====================|
    32.32%, 362 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 13: |====================|====================|
    34.55%, 387 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 14: |====================|====================|
    31.61%, 354 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 15: |====================|====================|
    29.73%, 333 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 16: |====================|====================|
    31.88%, 357 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 17: |====================|====================|
    30.89%, 346 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 18: |====================|====================|
    33.75%, 378 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 19: |====================|====================|
    32.32%, 362 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 20: |====================|====================|
    30.89%, 346 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 21: |====================|====================|
    29.73%, 333 / 1,120 (Tue Aug  6 13:47:36 2024)
Chrom 22: |====================|====================|
    29.02%, 325 / 1,120 (Tue Aug  6 13:47:36 2024)
8,062 markers are selected in total.
Show the code
pruned.anchored <- unlist(snpset.anchored, use.names = FALSE)

## step 2: KING ####
king.anchored <- snpgdsIBDKING(gds, sample.id = c(asw.ids, ceu.ids, yri.ids), 
                               snp.id = pruned.anchored)
IBD analysis (KING method of moment) on genotypes:
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 1s
# of selected variants: 8,062
    # of samples: 268
    # of SNVs: 8,062
    using 1 thread
No family is specified, and all individuals are treated as singletons.
Relationship inference in the presence of population stratification.
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:37 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:37 2024    Done.
Show the code
kingMat.anchored <- king.anchored$kinship
colnames(kingMat.anchored) <- rownames(kingMat.anchored) <- king.anchored$sample.id

# extract pairwise kinship estimates and IBS0 to plot
kinship.anchored <- snpgdsIBDSelection(king.anchored)

# plot
ggplot(kinship.anchored, aes(IBS0, kinship)) +
  geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
  geom_point(alpha=0.5) +
  ylab("kinship estimate") +
  theme_bw()

Show the code
## step 3: PC-AiR ####
# partition samples into related and unrelated
sampset.anchored <- pcairPartition(kinobj=kingMat.anchored, 
                                   kin.thresh=2^(-9/2),
                                   divobj=kingMat.anchored, 
                                   div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 268 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
sapply(sampset.anchored, length)
  rels unrels 
    18    250 
Show the code
# run PCA on unrelated
pca.unrel.anchored <- snpgdsPCA(gds, 
                                sample.id=sampset.anchored$unrels, 
                                snp.id=pruned.anchored)
Principal Component Analysis (PCA) on genotypes:
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 8,062
    # of samples: 250
    # of SNVs: 8,062
    using 1 thread
    # of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:38 2024    (internal increment: 3904)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:38 2024    Begin (eigenvalues and eigenvectors)
Tue Aug  6 13:47:38 2024    Done.
Show the code
# project values for relatives
snp.load.anchored <- snpgdsPCASNPLoading(pca.unrel.anchored, 
                                         gdsobj=gds)
SNP Loading:
    # of samples: 250
    # of SNPs: 8,062
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:38 2024    (internal increment: 31260)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:38 2024    Done.
Show the code
samp.load.anchored <- snpgdsPCASampLoading(snp.load.anchored, 
                                  gdsobj=gds, 
                                  sample.id=sampset.anchored$rels)
Sample Loading:
    # of samples: 18
    # of SNPs: 8,062
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:38 2024    (internal increment: 65536)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:38 2024    Done.
Show the code
# combine unrelated and related PCs and order as in ASW ID list
pcs.anchored <- rbind(pca.unrel.anchored$eigenvect, 
                      samp.load.anchored$eigenvect)
rownames(pcs.anchored) <- c(pca.unrel.anchored$sample.id, 
                            samp.load.anchored$sample.id)
samp.ord <- match(c(asw.ids, ceu.ids, yri.ids), 
                  rownames(pcs.anchored))
pcs.anchored <- pcs.anchored[samp.ord,]

## step 3b: determine which PCs are ancestry-informative ####
## (make a parallel coordinates plot)

# merge data with sample annotation info
pc.df.anchored <- as.data.frame(pcs.anchored)
names(pc.df.anchored) <- 1:ncol(pcs.anchored)
pc.df.anchored$sample.id <- row.names(pcs.anchored)
pc.df.anchored <- left_join(pc.df.anchored, pData(annot), by='sample.id')

# plot
ggparcoord(pc.df.anchored, 
           columns=1:12, 
           groupColumn="Population", 
           scale="uniminmax") +
  scale_color_manual(values=pop.cols) +
  xlab("PC") + ylab("")

Show the code
p1 <- ggplot(pc.df.anchored, aes(`1`, `2`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC1', y = 'PC2')
p2 <- ggplot(pc.df.anchored, aes(`3`, `4`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC3', y = 'PC4')
p3 <- ggplot(pc.df.anchored, aes(`5`, `6`, color = Population)) + 
  geom_point() + 
  scale_color_manual(values = pop.cols) + 
  labs(x = 'PC5', y = 'PC6')
grid.arrange(p1,p2,p3)

Show the code
## step 4: PC-Relate ####
# SeqVarIterator setup
seqResetFilter(gds)
# of selected samples: 1,126
# of selected variants: 25,760
Show the code
seqData <- SeqVarData(gds)
seqSetFilter(seqData, variant.id = pruned.anchored, 
             sample.id = c(asw.ids, ceu.ids, yri.ids))
# of selected samples: 268
# of selected variants: 8,062
Show the code
iterator <- SeqVarBlockIterator(seqData, verbose = F)

# run pcrelate
pcrel.anchored <- pcrelate(iterator, pcs = pcs.anchored[,1:2], 
                           training.set = sampset.anchored$unrels,
                  sample.include = c(asw.ids, ceu.ids, yri.ids))
Using 6 CPU cores
268 samples to be included in the analysis...
Betas for 2 PC(s) will be calculated using 250 samples in training.set...
Running PC-Relate analysis for 268 samples using 8062 SNPs in 1 blocks...
Performing Small Sample Correction...
Show the code
#names(pcrel)

# make kinship matrix from PC-Relate results
pcrelMat.anchored <- pcrelateToMatrix(pcrel.anchored, 
                                      scaleKin=1, verbose=FALSE)

# run PC-AiR again
seqResetFilter(seqData, verbose=FALSE)
pca.anchored <- pcair(seqData,
             kinobj=pcrelMat.anchored, kin.thresh=2^(-9/2),
             divobj=kingMat.anchored, div.thresh=-2^(-9/2),
             sample.include=c(asw.ids, ceu.ids, yri.ids), 
             snp.include=pruned.anchored,
             verbose=FALSE)
#names(pca)

# plot
pcs.anchored <- pca.anchored$vectors
pc.df.anchored <- as.data.frame(pcs.anchored)
names(pc.df.anchored) <- paste0("PC", 1:ncol(pcs.anchored))
pc.df.anchored$sample.id <- row.names(pcs.anchored)
pc.df.anchored <- left_join(pc.df.anchored, pData(annot), 
                            by="sample.id")

ggplot(pc.df.anchored, aes(PC1, PC2, color=Population)) + 
  geom_point() +
  scale_color_manual(values=pop.cols)

Show the code
# use revised PCs to get new kinship estimates
seqSetFilter(seqData, variant.id=pruned.anchored)
# of selected variants: 8,062
Show the code
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)
pcrel.anchored <- pcrelate(iterator, pcs=pcs.anchored[,1:2], 
                           training.set=pca.anchored$unrels, 
                  sample.include=c(asw.ids, ceu.ids, yri.ids))
Using 6 CPU cores
268 samples to be included in the analysis...
Betas for 2 PC(s) will be calculated using 259 samples in training.set...
Running PC-Relate analysis for 268 samples using 8062 SNPs in 1 blocks...
Performing Small Sample Correction...
Show the code
# plot new kinship estimates
kinship.anchored <- pcrel.anchored$kinBtwn

ggplot(kinship.anchored, aes(k0, kin)) +
  geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
  geom_point(alpha=0.5) +
  ylab("kinship estimate") +
  theme_bw()

Show the code
# get new partition
pcrelMat.anchored <- pcrelateToMatrix(pcrel.anchored, scaleKin=1, verbose=FALSE)
sampset2.anchored <- pcairPartition(kinobj=pcrelMat.anchored, 
                                    kin.thresh=2^(-9/2),
                           divobj=kingMat.anchored, div.thresh=-2^(-9/2))
Using kinobj and divobj to partition samples into unrelated and related sets
Working with 268 samples
Identifying relatives for each sample using kinship threshold 0.0441941738241592
Identifying pairs of divergent samples using divergence threshold -0.0441941738241592
Partitioning samples into unrelated and related sets...
Show the code
unrel.ids.anchored <- sampset2.anchored$unrels

#### run "naive" PCA ####
pca.naive.anchored <- snpgdsPCA(gds, sample.id = unrel.ids.anchored)
Principal Component Analysis (PCA) on genotypes:
Excluding 1,120 SNVs on non-autosomes
Calculating allele counts/frequencies ...

[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
# of selected variants: 14,028
Excluding 10,612 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
    # of samples: 259
    # of SNVs: 14,028
    using 1 thread
    # of principal components: 32
CPU capabilities: Double-Precision SSE2
Tue Aug  6 13:47:59 2024    (internal increment: 3768)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:59 2024    Begin (eigenvalues and eigenvectors)
Tue Aug  6 13:47:59 2024    Done.
Show the code
# calculate SNP loadings
snp.load.anchored <- snpgdsPCASNPLoading(pca.naive.anchored, gdsobj=gds)
SNP Loading:
    # of samples: 259
    # of SNPs: 14,028
    using 1 thread
    using the top 32 eigenvectors
Tue Aug  6 13:47:59 2024    (internal increment: 30172)

[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
Tue Aug  6 13:47:59 2024    Done.
Show the code
# plot set-up
snp.load.df.anchored <- as.data.frame(cbind(snp.load.anchored$snp.id,
                                   t(snp.load.anchored$snploading)))
names(snp.load.df.anchored) <- c('snp.id', paste('PC',1:32,sep=''))
missing <- !(readex.gdsn(index.gdsn(gds, 'variant.id')) %in% snp.load.anchored$snp.id)
snp.annot.anchored <- data.frame(snp.id = readex.gdsn(index.gdsn(gds, 'variant.id')),
                        chr = readex.gdsn(index.gdsn(gds, 'chromosome')),
                        pos = readex.gdsn(index.gdsn(gds, 'position')),
                        stringsAsFactors = F)
snp.load.df.anchored <- left_join(snp.load.df.anchored, snp.annot.anchored, by = 'snp.id')
snp.load.df.long.anchored <- snp.load.df.anchored %>%
  pivot_longer(cols=PC1:PC32, names_to = 'PC') %>%
  mutate(PC = factor(PC, levels = paste0("PC", 1:32))) %>%
  mutate(chr = factor(chr, levels = c(1:22, "X")))

# set up color scheme
chr <- levels(snp.load.df.long.anchored$chr)
cmap <- setNames(rep_len(brewer.pal(8, "Dark2"), length(chr)), chr)

# plot first 4 PCs page
n_pcs.anchored <- length(unique(snp.load.df.long.anchored$PC))
n_plots.anchored <- ceiling(n_pcs.anchored/as.integer(4)) # change 4 depending on how many plots per page you want
bins.anchored <- as.integer(cut(1:n_pcs.anchored, n_plots.anchored))
i <- 1
bin <- paste0("PC", which(bins == i))
dat.anchored <- filter(snp.load.df.long.anchored, PC %in% bin)
ggplot(dat.anchored, aes(chr, value, group=interaction(chr, pos), color=chr)) +
  geom_point(position=position_dodge(0.8)) +
  facet_wrap(~PC, scales="free", ncol=1) +
  scale_color_manual(values=cmap, breaks=names(cmap)) +
  #ylim(0,1) +
  ylim(-0.05,0.05)+
  theme_bw() +
  theme(legend.position="none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("chromosome") + ylab("loading")
Warning: Removed 143 rows containing missing values or values outside the scale range
(`geom_point()`).

Compare Results

Compare pruned variants:

# compare to ASW only analysis
length(pruned)
[1] 5315
length(pruned.anchored) 
[1] 8062

Compare SNP loadings:

dim(snp.load.df.anchored)
[1] 14028    35
dim(snp.load.df)
[1] 9665   35
snp.load.overlap <- full_join(snp.load.df.anchored,
                              snp.load.df,
                              by = 'snp.id')
snp.load.overlap %>%
  select(PC1.x, PC1.y) %>%
  cor(use = 'complete.obs')
           PC1.x      PC1.y
PC1.x  1.0000000 -0.4649241
PC1.y -0.4649241  1.0000000
snp.load.overlap %>%
  select(PC2.x, PC2.y) %>%
  cor(use = 'complete.obs')
         PC2.x    PC2.y
PC2.x 1.000000 0.809642
PC2.y 0.809642 1.000000
snp.load.overlap %>%
  select(PC3.x, PC3.y) %>%
  cor(use = 'complete.obs')
           PC3.x      PC3.y
PC3.x 1.00000000 0.05959983
PC3.y 0.05959983 1.00000000
snp.load.overlap %>%
  select(PC4.x, PC4.y) %>%
  cor(use = 'complete.obs')
          PC4.x     PC4.y
PC4.x  1.000000 -0.308079
PC4.y -0.308079  1.000000
snp.load.overlap %>%
  select(PC1.x, PC1.y, PC2.x, PC2.y, PC3.x, PC3.y, PC4.x, PC4.y) %>%
  cor(use = 'complete.obs')
              PC1.x         PC1.y       PC2.x         PC2.y        PC3.x
PC1.x  1.0000000000 -0.4649240831 -0.01304922 -4.404165e-02 -0.008622842
PC1.y -0.4649240831  1.0000000000 -0.14904816  8.060876e-04 -0.057542402
PC2.x -0.0130492223 -0.1490481596  1.00000000  8.096420e-01 -0.037330583
PC2.y -0.0440416452  0.0008060876  0.80964196  1.000000e+00 -0.237848700
PC3.x -0.0086228423 -0.0575424017 -0.03733058 -2.378487e-01  1.000000000
PC3.y  0.0135072689 -0.0011237671  0.09305470  6.071889e-05  0.059599826
PC4.x -0.0004368284  0.0023706262 -0.01063945 -1.886671e-01 -0.005955651
PC4.y -0.0504609348 -0.0013820450 -0.15810972  1.687007e-04  0.117656933
              PC3.y         PC4.x         PC4.y
PC1.x  1.350727e-02 -0.0004368284 -5.046093e-02
PC1.y -1.123767e-03  0.0023706262 -1.382045e-03
PC2.x  9.305470e-02 -0.0106394542 -1.581097e-01
PC2.y  6.071889e-05 -0.1886670818  1.687007e-04
PC3.x  5.959983e-02 -0.0059556515  1.176569e-01
PC3.y  1.000000e+00  0.1226677827  7.814901e-05
PC4.x  1.226678e-01  1.0000000000 -3.080790e-01
PC4.y  7.814901e-05 -0.3080790330  1.000000e+00

Compare PC scores:

Show the code
tmp.anchored <- as.data.frame(pca.naive.anchored$eigenvect) %>% mutate(sample.id = pca.naive.anchored$sample.id)
tmp <- as.data.frame(pca.naive$eigenvect) %>% mutate(sample.id = pca.naive$sample.id)
pcs.combined <- tmp %>%
  left_join(tmp.anchored, by = 'sample.id')

pcs.combined %>% 
  select(V1.x, V1.y) %>%
  cor(use = 'complete.obs')
           V1.x       V1.y
V1.x  1.0000000 -0.9232392
V1.y -0.9232392  1.0000000
Show the code
pcs.combined %>% 
  select(V2.x, V2.y) %>%
  cor(use = 'complete.obs')
          V2.x      V2.y
V2.x 1.0000000 0.9019153
V2.y 0.9019153 1.0000000
Show the code
pcs.combined %>% 
  select(V3.x, V3.y) %>%
  cor(use = 'complete.obs')
           V3.x       V3.y
V3.x 1.00000000 0.07737893
V3.y 0.07737893 1.00000000
Show the code
pcs.combined %>% 
  select(V4.x, V4.y) %>%
  cor(use = 'complete.obs')
           V4.x       V4.y
V4.x  1.0000000 -0.3552891
V4.y -0.3552891  1.0000000